##### ########################################################## ######
#####                                                            ######
#####   Input: Debates data and MP-by-debate data                ######
#####   Output: Run and analysis topic model                     ######
#####                                                            ######
##### ########################################################## ######

library(quanteda) # v3.3.1
library(stm) # v1.3.6.1
library(tidytext) # v0.4.1
library(tidyr) # v1.3.0
library(dplyr) # v1.1.3
library(ggplot2) # v3.4.3
library(data.table) # v1.14.8
library(sandwich) # v3.0-2

# Load data

load("data/debates.Rdata")

# Cut 1992-1997 session & speaker

debates <- debates[debates$parliamentary_term != "1992-1997"]

# Collapse to speaker in debate level

text_by_mp_in_debate <- debates[,list(body = paste0(body, collapse =  " "),
                                      gender = unique(gender), year = unique(year)),
                                by = list(section_id,person_id)]

# Convert text_by_mp_in_debate$body to a corpus object
debate_corpus <- corpus(text_by_mp_in_debate, text_field = "body")

# 2. Convert that corpus object to a dfm, remove stopwords, move to lower case

debate_dfm <- tokens(debate_corpus,
                     remove_punct = TRUE) %>%
  tokens_remove(pattern = stopwords("english")) %>%
  tokens_tolower() %>%
  dfm()

# 3. Trim dfm to remove very infrequent words
debate_dfm_stm  <- dfm_trim(debate_dfm, min_docfreq = .01, max_docfreq = .90, docfreq_type = "prop")

save(debate_dfm_stm, file = "working/stm_out/debate_dfm_stm.Rdata")

# 4. Run stm()
stm_object <- stm(debate_dfm_stm,
                  K = 20,
                  seed = 12345)

save(stm_object, file = "working/stm_out_20.Rdata")

# Load relevant data

load("working/stm_out/debate_dfm_stm.Rdata")
load("working/stm_out/stm_out_20.Rdata")

text_by_mp_in_debate <- debates[,list(body = paste0(body, collapse =  " "),
                                      gender = unique(gender),
                                      year = unique(year),
                                      parliamentary_term = unique(parliamentary_term)),
                                by = list(section_id,person_id)]

# Extra and label topic proportions

topic_proportions <- data.frame(stm_object$theta)
topic_labels <- apply(labelTopics(stm_object)$frex,1, function(x) paste0(x, collapse = "_"))
names(topic_proportions) <- topic_labels

debate_dfm <- data.frame(docvars(debate_dfm_stm)$section_id,
                   docvars(debate_dfm_stm)$person_id)
debate_dfm$section_id <- debate_dfm$docvars.debate_dfm_stm..section_id
debate_dfm$person_id <- debate_dfm$docvars.debate_dfm_stm..person_id

debate_dfm <- debate_dfm[1:418403,c(2:4)]
out <- data.frame(debate_dfm, topic_proportions)
out <- merge(out, text_by_mp_in_debate, by = c("person_id", "section_id"))

# Merge back in with speech_scores

load("working/speech_scores.Rdata")
speech_scores <- merge(out, speech_scores, by = c("person_id","section_id"))

# Drop dead variables

speech_scores$docvars.debate_dfm_stm..person_id <- NULL
speech_scores$gender <- speech_scores$gender.x
speech_scores$gender.x <- NULL
speech_scores$gender.y <- NULL
speech_scores$parliamentary_term <- speech_scores$parliamentary_term.x
speech_scores$parliamentary_term.x <- NULL
speech_scores$parliamentary_term.y <- NULL
speech_scores$year <- speech_scores$year.x
speech_scores$year.x <- NULL
speech_scores$year.y <- NULL

# Calculate topics as proportions

speech_scores$question_answer_asked_put_questions_asking_answers <- speech_scores$question_answer_asked_put_questions_asking_answers*100
speech_scores$get_things_hear_go_lot_going_really <- speech_scores$get_things_hear_go_lot_going_really*100
speech_scores$secretary_state_labour_rose_government.s_failed_conservative <- speech_scores$secretary_state_labour_rose_government.s_failed_conservative*100
speech_scores$public_private_sector_service_beg_voluntary_civil <- speech_scores$public_private_sector_service_beg_voluntary_civil*100
speech_scores$friend_hon_give_gentleman_right_member_lady <- speech_scores$friend_hon_give_gentleman_right_member_lady*100
speech_scores$way_accept_argument_entirely_view_reason_shall <- speech_scores$way_accept_argument_entirely_view_reason_shall*100
speech_scores$mr_minister_statement_prime_inquiry_minister.s_deputy <- speech_scores$mr_minister_statement_prime_inquiry_minister.s_deputy*100
speech_scores$eu_european_union_europe_treaty_brexit_trade <- speech_scores$eu_european_union_europe_treaty_brexit_trade*100
speech_scores$funding_million_cent_rail_per_students_. <- speech_scores$funding_million_cent_rail_per_students_.*100
speech_scores$armed_military_forces_defence_iraq_war_afghanistan <- speech_scores$armed_military_forces_defence_iraq_war_afghanistan*100
speech_scores$councils_local_housing_authorities_residents_planning_london <- speech_scores$councils_local_housing_authorities_residents_planning_london*100
speech_scores$committee_select_motion_vote_electoral_parliament_chamber <- speech_scores$committee_select_motion_vote_electoral_parliament_chamber*100
speech_scores$energy_food_climate_farmers_industry_water_gas <- speech_scores$energy_food_climate_farmers_industry_water_gas*100
speech_scores$tax_pension_income_pensions_chancellor_pensioners_democrats <- speech_scores$tax_pension_income_pensions_chancellor_pensioners_democrats*100
speech_scores$nhs_care_patients_mental_hospital_schools_school <- speech_scores$nhs_care_patients_mental_hospital_schools_school*100
speech_scores$ensure_review_work_steps_progress_ensuring_working <- speech_scores$ensure_review_work_steps_progress_ensuring_working*100
speech_scores$ireland_northern_scottish_scotland_welsh_wales_assembly <- speech_scores$ireland_northern_scottish_scotland_welsh_wales_assembly*100
speech_scores$clause_speaker_amendment_amendments_bill_provisions_court <- speech_scores$clause_speaker_amendment_amendments_bill_provisions_court*100
speech_scores$post_company_bbc_banks_bank_businesses_scheme <- speech_scores$post_company_bbc_banks_bank_businesses_scheme*100
speech_scores$police_crime_prison_women_victims_officers_violence <- speech_scores$police_crime_prison_women_victims_officers_violence*100

# Analysis of seniority and topics

topics <- names(speech_scores)[3:22]

covariate_formula <- "~ years_in_parliament + gender + party + attends_cabinet + age_years + attends_shadow_cabinet + is_gov_minister +
                        is_opp_minister + is_committee_chair +  marginality + gov_mp"

est_conf_ints <- function(x = bivariate_model){

  coefs <- coef(summary(x))
  est <- coefs[,1]
  se <- coefs[,2]
  hi <- est + 1.96 * se
  lo <- est - 1.96 * se
  return(data.frame(est, hi, lo))
}

coef_list <- list()
bivariate_list <- list()
multivariate_list <- list()
i<-0
topic <- "question_answer_asked_put_questions_asking_answers"
for(topic in topics){


  print(topic)
  i <- i+1

  bivariate_model <- lm(as.formula(paste0(topic, "~ years_in_parliament")),
                        data = speech_scores)

  multivariate_model <- lm(as.formula(paste0(topic, covariate_formula)),
                           data = speech_scores)

  out <- list()
  out$bivariate <- est_conf_ints(bivariate_model)
  out$multivariate_model <- est_conf_ints(multivariate_model)

  coef_list[[i]] <- out

}

names(coef_list) <- topic_labels

bivariate_out <- lapply(1:length(coef_list), function(x) {
  tmp <- coef_list[[x]]$bivariate
  tmp$topic <- names(coef_list)[x]
  tmp[rownames(tmp) == "years_in_parliament",]
})

multivariate_out <- lapply(1:length(coef_list), function(x) {
  tmp <- coef_list[[x]]$multivariate_model
  tmp$topic <- names(coef_list)[x]
  tmp[rownames(tmp) == "years_in_parliament",]
})

bivariate_out <- data.frame(do.call("rbind", bivariate_out))
multivariate_out <- data.frame(do.call("rbind", multivariate_out))

bivariate_out$model <- "Bivariate"
multivariate_out$model <- "Multivariate"
out <- rbind(bivariate_out, multivariate_out)
out$model <- factor(out$model, levels = c("Bivariate", "Multivariate"))
out$topic <- factor(out$topic, levels = out$topic[out$model == "Multivariate"][order(out$est[out$model == "Multivariate"])])

pdf("analysis/plots/figure_S4.pdf",10,6)
ggplot(out, aes(x = est, xmin = lo, xmax = hi, y = topic, col = model)) +
  geom_point(position = position_dodge(width=0.5), aes(shape = model, color=model)) +
  geom_errorbarh(height = .001, position = position_dodge(width=0.5)) +
  theme_bw() +
  scale_color_manual(values = c("black", "grey40")) +
  scale_shape_manual(values=c(19,1)) +
  theme(axis.text = element_text(size = 9),
        axis.title = element_text(size = 9),
        legend.title = element_text(size=9),
        legend.text = element_text(size=9)) +
  xlab("Years in Parliament") +
  ylab("") + geom_vline(xintercept = 0, linetype = 2)
dev.off()

# Gender gaps topic model analysis 

covariate_formula <- "~ gender + years_in_parliament + party + attends_cabinet + age_years + attends_shadow_cabinet + is_gov_minister +
                        is_opp_minister + is_committee_chair +  marginality + gov_mp"

est_conf_ints <- function(x = bivariate_model){
  
  coefs <- coef(summary(x))
  est <- coefs[,1]
  se <- coefs[,2]
  hi <- est + 1.96 * se
  lo <- est - 1.96 * se
  return(data.frame(est, hi, lo, se))
}

coef_list <- list()
bivariate_list <- list()
multivariate_list <- list()
i<-0
topic <- "question_answer_asked_put_questions_asking_answers"
for(topic in topics){
  
  
  print(topic)
  i <- i+1
  
  bivariate_model <- lm(as.formula(paste0(topic, "~ gender")),
                        data = speech_scores)
  
  multivariate_model <- lm(as.formula(paste0(topic, covariate_formula)),
                           data = speech_scores)
  
  out <- list()
  out$bivariate <- est_conf_ints(bivariate_model)
  out$multivariate_model <- est_conf_ints(multivariate_model)
  
  coef_list[[i]] <- out
  
}

names(coef_list) <- topic_labels

bivariate_out <- lapply(1:length(coef_list), function(x) {
  tmp <- coef_list[[x]]$bivariate
  tmp$topic <- names(coef_list)[x]
  tmp[rownames(tmp) == "genderFemale",]
})

multivariate_out <- lapply(1:length(coef_list), function(x) {
  tmp <- coef_list[[x]]$multivariate_model
  tmp$topic <- names(coef_list)[x]
  tmp[rownames(tmp) == "genderFemale",]
})

bivariate_out <- data.frame(do.call("rbind", bivariate_out))
multivariate_out <- data.frame(do.call("rbind", multivariate_out))

bivariate_out$model <- "Bivariate"
multivariate_out$model <- "Multivariate"
out <- rbind(multivariate_out)
out$model <- factor(out$model, levels = c("Multivariate"))
out$topic <- factor(out$topic, levels = out$topic[out$model == "Multivariate"][order(out$est[out$model == "Multivariate"])])
out$sig <- ifelse(abs(out$est / out$se) >= 1.96, TRUE, FALSE)

pdf("analysis/plots/figure_S6.pdf",10,6)
ggplot(out, aes(x = est, xmin = lo, xmax = hi, y = topic)) +
  geom_point(shape=20, size=3, aes(col=ifelse(sig=="TRUE", "black", "grey80"))) +
  geom_errorbarh(height = .001, aes(col=ifelse(sig=="TRUE", "black", "grey80"))) +
  theme_bw() +
  scale_color_manual(values = c("black", "grey80")) +
  theme(axis.text.y = element_text(size = 9),
        axis.title = element_text(size = 9),
        legend.position = "none") +
  xlab("Woman MP difference") +
  ylab("") + geom_vline(xintercept = 0, linetype = 2)
dev.off()
